"""
This file creates the member-level data used in The Deaths of Ideas in Congress to assess if deaths predict introductions
First created: 12/3/23
Final edit: 12/18/23
"""
import os
import pandas as pd
import numpy as np
full_data=pd.read_csv('/replication/deathideas_data.csv')
#Group by member
full_data = full_data.drop_duplicates(subset=['section']) #Remove duplicate sections
full_data2 = full_data.groupby(["icpsr", "Congress"])['ideodist_floormedian', 'chref', 'rankref', 'majority', 'les_benchmarklag', 'senate', 'bills_sponsored', 'seniority', 'change_divgov', 'change_gridlockpp', 'committee_leader', 'party_leader'].first()
full_data2 = full_data2.reset_index(level=['icpsr', 'Congress'])

#Create counts of lagged deaths in Congress
mc_data = full_data.groupby(['icpsr', 'Congress'])['death'].sum()
mc_data = mc_data.reset_index(level=['icpsr', 'Congress'])
mc_data['deaths_lag']= mc_data.groupby('icpsr')['death'].shift(periods=1) 

#Create number of introduced sections and count of number of reintroduced sections
#Import all sections
cong=[103,104,105,106,107,108, 109, 110, 111, 112, 113, 114]
secs_all = pd.DataFrame()
reintro = pd.DataFrame()
enacted = pd.DataFrame()

for i in cong:
   #Bring in all sections 
   num = str(i)
   new_secs = pd.read_csv('/replication/Cleaned Sections/'+num+'/'+num+'_sections_noboilerplate.csv')
   secs_all = secs_all.append(new_secs)
   secs_all = secs_all.drop(['Unnamed: 0'], axis=1)
   new_secs=[]
   secs_all = secs_all.loc[(secs_all['bill'].str.contains("ih", na=False)) | (secs_all['bill'].str.contains("is", na=False))]

    #Count reintroduced sections
   os.chdir('/replication/Reintroduced')
   new_reintro=pd.read_csv('secs'+num+'_reintroduced.csv')
   new_reintro.loc[(new_reintro['txt1_ad_5']==1) & (new_reintro['txt2_ad_5']==1) & (new_reintro['perblock_txt1']==1) & (new_reintro['perblock_txt2']==1) & (new_reintro['first_txt1_ad_2']==1) & (new_reintro['first_txt2_ad_2']==1) , 'identical']=1
   new_reintro.loc[(new_reintro['txt1_ad_5']>=0.95) & (new_reintro['txt2_ad_5']>=0.95) & (new_reintro['perblock_txt1']>=0.95) & (new_reintro['perblock_txt2']>=0.95) & (new_reintro['first_txt1_ad_2']>=0.95) & (new_reintro['first_txt2_ad_2']>=0.95) , 'near_identical']=1
   new_reintro.loc[(new_reintro['txt1_ad_5']>=0.80) & (new_reintro['txt2_ad_5']>=0.80) & (new_reintro['perblock_txt1']>=0.80) & (new_reintro['perblock_txt2']>=0.80) & (new_reintro['first_txt1_ad_2']>=0.80) & (new_reintro['first_txt2_ad_2']>=0.80) , 'robustnear_identical']=1

   new_reintro["identical"] = np.where(new_reintro["identical"] >0, 1, 0)
   new_reintro["near_identical"] = np.where(new_reintro["near_identical"] >0, 1, 0)
   new_reintro["robustnear_identical"] = np.where(new_reintro["robustnear_identical"] >0, 1, 0)

   reintro_secs = new_reintro.groupby(new_reintro['sec1'])[('match', 'identical', 'near_identical', 'robustnear_identical')].sum()
   reintro_secs['section'] = reintro_secs.index    
   reintro_secs=reintro_secs.rename(columns={"match":"reintro_match", "identical":'reintro_identical', 'near_identical':'reintro_near_identical', 'robustnear_identical':'reintro_robustnear_identical'})
   reintro = reintro.append(reintro_secs, sort=True)
   
   os.chdir('/replication/Enacted Sections')
   new_enacted=pd.read_csv('secs'+num+'_enacted.csv')
   #Collapse sections into single file for each sec
   enacted_secs = new_enacted.groupby(new_enacted['sec1']).count()
   enacted_secs['section'] = enacted_secs.index    
   enacted_secs= enacted_secs[['section', 'match']]
   enacted_secs=enacted_secs.rename(columns={"match":"enacted_match"})
   enacted = enacted.append(enacted_secs, sort=True)
   
   #Create df of unenacted sections
   unenacted = pd.merge(secs_all, enacted, on='section', how='left')
   unenacted = unenacted[unenacted['enacted_match'].isnull()]

   #Create DV of reintroduced or not
   dv = pd.merge(unenacted, reintro, on='section', how='left')

   #Create DV column
   dv['reintroduced']=dv['reintro_match']
   dv['reintro_match'] = dv['reintro_match'].fillna(0)
   dv['reintroduced'] = dv['reintroduced'].fillna(0)
   dv['reintroduced'] = dv['reintroduced'].astype(int)
   dv["reintroduced"] = np.where(dv["reintroduced"] >0, 1, 0)
   dv["death"] = np.where(dv["reintroduced"] >0, 0, 1)
   dv["reintro_identical"] = np.where(dv["reintro_identical"] >0, 1, 0)
   dv["reintro_near_identical"] = np.where(dv["reintro_near_identical"] >0, 1, 0)
   dv["reintro_robustnear_identical"] = np.where(dv["reintro_robustnear_identical"] >0, 1, 0)
   dv['billid']=dv['section'].str.split('_').str[0]

dv.loc[dv['clean_text'].str.contains("harmonized tariff", na=False), 'tariff']=1
dv.loc[dv['clean_text'].str.contains("facility united states postal service", na=False), 'post_office']=1
dv.loc[dv['clean_text'].str.contains("coin specifications", na=False), 'coins']=1
dv.loc[dv['clean_text'].str.contains("gold medal", na=False), 'gold_medal']=1
dv.loc[dv['clean_text'].str.contains("known designated", na=False), 'facility']=1
dv['facility'] = dv['facility'].fillna(0)
dv['coins'] = dv['coins'].fillna(0)
dv['tariff'] = dv['tariff'].fillna(0)
dv['post_office'] = dv['post_office'].fillna(0)
dv['gold_medal'] = dv['gold_medal'].fillna(0)
dv = dv[(dv.tariff != 1)] 
dv = dv[(dv.post_office != 1)]
dv = dv[(dv.coins != 1)]
dv = dv[(dv.gold_medal != 1)]
dv = dv[(dv.facility != 1)]

#Remove Appropriations bills
approps=pd.read_excel('/replication/appropriations_bills.xlsx')
dv = pd.merge(dv, approps, on='billid', how='left')
dv["approps"] = np.where(dv["approps"] >0, 1, 0)
dv= dv[(dv.approps != 1)]

dv['sec_count'] = 1
mc_data2 = dv.groupby(['bill'])['sec_count', 'reintroduced', 'reintro_identical', 'reintro_near_identical'].sum()
mc_data2 = mc_data2.reset_index(level=['bill'])
mc_data2['billid'] = mc_data2.bill.str[:-2]
mc_data2['new_intro'] = mc_data2['sec_count']-mc_data2['reintroduced']
mc_data2['new_intro_identical'] = mc_data2['sec_count']-mc_data2['reintro_identical']
mc_data2['new_intro_nearidentical'] = mc_data2['sec_count']-mc_data2['reintro_near_identical']

#Get sponsor and policy agendas code for each bill
cbp=pd.read_csv('/replication/bills102-114.csv')
cbp = cbp.reset_index(drop=True)
cbp_slim = cbp[['billid', 'pooleid', 'cong']] 
cbp_slim['billid'] = cbp_slim['billid'].str.replace('-', '')
cbp_slim['billid'] = cbp_slim['billid'].str.lower()
cbp_slim=cbp_slim.rename(columns={"pooleid":"icpsr", 'cong':'Congress'})

mc_data3 = pd.merge(cbp_slim, mc_data2, on="billid", how="right")
mc_data3 = mc_data3.groupby(['icpsr', 'Congress'])['new_intro', 'new_intro_identical', 'new_intro_nearidentical', 'sec_count'].sum()
mc_data3 = mc_data3.reset_index(level=['icpsr', 'Congress'])
mc_data3=mc_data3.rename(columns={"sec_count":"total_introduced"})

mem_data = pd.merge(mc_data, mc_data3, on=["icpsr", "Congress"], how="inner")
member_data = pd.merge(mem_data, full_data2, on=['icpsr', "Congress"], how='left')

member_data.to_csv(r'\replication\member_intros_deaths.csv')